In [1]:
%%bash
# List entries in the working directory whose name contains ".csv".
# -F makes grep treat the pattern as a fixed string; without it the "."
# is a regex wildcard and names such as "mycsvdump.txt" would match too.
ls | grep -F '.csv'
In [2]:
# %%bash
# pip3 install bokeh
In [3]:
# built-in libs
import email
# processing libs
import pandas as pd
# display libs
from tqdm import tqdm_notebook
In [4]:
# Open emails.csv for chunked reading (10,000 rows per chunk). Because
# chunksize is given, emails_full_df is an iterator over chunks rather than
# a fully loaded frame.
emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
# Work only with the first chunk for the rest of the notebook.
emails_df = next(emails_full_df)
In [5]:
# Report the chunk's (rows, columns) shape, then preview its first rows.
print(emails_df.shape)
emails_df.head()
Out[5]:
In [6]:
# Print pandas' summary of the loaded chunk.
emails_df.info()
In [7]:
%time
messages_obj_lst = []
messages_str_lst = []
message_metadata = {}
for i in tqdm_notebook(range(emails_df.shape[0])):
msg = email.message_from_string(emails_df.message[i])
for msg_property in msg:
if msg_property in message_metadata:
message_metadata[msg_property][i] = msg[msg_property]
else:
message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
payload = msg.get_payload() # decode=True
messages_obj_lst.append(msg)
messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
#except KeyboardInterrupt:
# break
print('messages_obj_lst size: %i' % len(messages_obj_lst))
In [8]:
# update dataframe object: attach the parsed Message objects and their
# payloads as new columns, aligned positionally with the existing rows
# emails_df.rename(columns = {'message':'message_obj'}, inplace = True)
emails_df = emails_df.assign(
    message_obj=pd.Series(messages_obj_lst).values,
    payload=pd.Series(messages_str_lst).values,
)
# print(emails_df.payload.str.contains(r'\\'))
# Flatten each payload onto a single line. The pattern is the actual newline
# character, so it matches the same way whether pandas treats it as a regex
# or as a literal (the default for `regex` differs between pandas versions,
# and the raw string r'\n' is only a newline under regex matching).
# Newlines become a space rather than '' so that the last word of one line
# is not fused with the first word of the next.
emails_df['payload'] = emails_df.payload.str.replace('\n', ' ')
In [9]:
# Preview the frame now that message_obj and payload have been added.
emails_df.head()
Out[9]:
In [10]:
# del messages_obj_lst
# del messages_str_lst
# Drop the raw 'message' column. Rebinding the name (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to
# re-run: a second run is a no-op rather than a KeyError.
emails_df = emails_df.drop('message', axis=1, errors='ignore')
In [ ]:
In [11]:
# Concatenate (at most) the first 50,000 payloads into one newline-separated
# corpus string.
corpus_text = '\n'.join(emails_df['payload'].iloc[:50000])
# One "sentence" per corpus line: lower-cased and split on single spaces.
sentences = [
    line.lower().split(' ')
    for line in corpus_text.split('\n')
]
In [12]:
def clean(s):
    """
    Strip leading/trailing punctuation from each token of one sentence.

    s: iterable of token strings (a sentence already split into words).

    Returns a list of the stripped tokens, in order. Tokens that are empty
    after stripping (empty strings, or tokens made only of the stripped
    punctuation such as '...') are omitted, so no '' entries are returned.
    """
    stripped = (w.strip(',."!?:;()\'') for w in s)
    return [w for w in stripped if w]
# Clean every sentence, then filter on the *cleaned* tokens. Testing the
# length of the raw token list is not enough: str.split(' ') always yields
# at least one element (''), so that check never removes anything.
# Drop tokens that come back as '' and then drop sentences left with no tokens.
sentences = [
    tokens
    for tokens in ([w for w in clean(s) if w] for s in sentences)
    if tokens
]
In [ ]:
In [13]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the tokenised sentences.
model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=3,
    workers=4,
)
In [14]:
# Keep a short handle on the model's word-vector object.
vectors = model.wv
# `model` is still referenced by later cells, so it is not deleted here.
# del model
In [15]:
# Look up the vector stored for the word 'good'.
vectors['good']
Out[15]:
In [16]:
# Print the similarity score the model assigns to two word pairs.
print(vectors.similarity('you', 'your'))
print(vectors.similarity('you', 'internet'))
In [17]:
# Query the words the model ranks as most similar to 'kill'.
vectors.most_similar('kill')
Out[17]:
In [18]:
# Number of entries in the trained model's vocabulary.
len(model.wv.vocab)
Out[18]:
In [19]:
# build a list of the terms, integer indices,
# and term counts from the word2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]
# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])
# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
# print(ordered_terms)
# syn0norm (the normalised vectors) is filled lazily by init_sims(), which
# similarity queries such as most_similar() call internally. Call it
# explicitly so this cell does not depend on an earlier query having run;
# the call is harmless if the vectors are already initialised.
model.wv.init_sims()
# create a DataFrame with the normalised word vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(model.wv.syn0norm[list(term_indices), :], index=ordered_terms)
word_vectors.head(3)
Out[19]:
In [20]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list

    token: the query word.
    topn: how many similar terms to print (default 10).

    Prints one line per result: the term and its similarity rounded to
    three decimals. Returns None.
    """
    # Query through model.wv, the same word-vector object the rest of the
    # notebook uses, rather than the deprecated model-level most_similar.
    for word, similarity in model.wv.most_similar(positive=[token], topn=topn):
        print(word, round(similarity, 3))
In [21]:
# Print the 10 (default topn) terms most similar to 'illegal'.
get_related_terms(u'illegal')
In [22]:
# Print the 10 (default topn) terms most similar to 'killed'.
get_related_terms(u'killed')
In [23]:
# Print the 10 (default topn) terms most similar to 'contract'.
get_related_terms(u'contract')
In [24]:
# Print the 10 (default topn) terms most similar to 'fired'.
get_related_terms(u'fired')
In [25]:
def word_algebra(add=None, subtract=None, topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)

    add: words whose vectors contribute positively (default: none).
    subtract: words whose vectors contribute negatively (default: none).
    topn: number of result terms to print (default 1).

    Prints one term per line. Returns None.
    """
    # None defaults replace the former mutable [] defaults; omitting an
    # argument still means "no words on this side".
    positive = [] if add is None else add
    negative = [] if subtract is None else subtract
    # Query through model.wv rather than the deprecated model-level method.
    answers = model.wv.most_similar(positive=positive, negative=negative, topn=topn)
    for term, _similarity in answers:
        print(term)
In [26]:
# Print the single (default topn=1) term closest to 'i' + 'will'.
word_algebra(add=[u'i', u'will'])
In [27]:
# Print the single (default topn=1) term closest to 'you' + 'will'.
word_algebra(add=[u'you', u'will'])
In [28]:
# Print the single (default topn=1) term closest to 'i' + 'am'.
word_algebra(add=[u'i', u'am'])
In [29]:
# Print the single (default topn=1) term closest to the sum of the two given words.
word_algebra(add=[u'mother', u'fuck'])
In [ ]:
In [30]:
from sklearn.manifold import TSNE
In [31]:
# Restrict the t-SNE input to the first 5,000 rows of word_vectors.
tsne_input = word_vectors.head(5000)
In [32]:
# Peek at the first two rows of the t-SNE input.
tsne_input[:2]
Out[32]:
In [33]:
%%time
tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)
In [34]:
# Wrap the raw t-SNE output in a DataFrame: rows keep the labels of
# tsne_input, and the two output columns are named x_coord / y_coord.
tsne_vectors = pd.DataFrame(
    data=tsne_vectors,
    columns=[u'x_coord', u'y_coord'],
    index=pd.Index(tsne_input.index),
)
tsne_vectors.head()
Out[34]:
In [35]:
# Copy the row labels into a regular 'word' column so they can be
# referenced by column name (the plot's hover tooltip uses '@word').
tsne_vectors[u'word'] = tsne_vectors.index
In [36]:
# Preview the coordinates together with the new 'word' column.
tsne_vectors.head()
Out[36]:
In [37]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
# Tell Bokeh to render subsequent show() calls inline in this notebook.
output_notebook()
In [38]:
# Expose the t-SNE DataFrame to Bokeh so glyphs can reference its columns by name.
plot_data = ColumnDataSource(tsne_vectors)

# 800x800 figure with pan / zoom / select / reset tools;
# the scroll wheel is bound to wheel_zoom.
tsne_plot = figure(
    title=u't-SNE Word Embeddings',
    plot_width=800,
    plot_height=800,
    tools=(u'pan, wheel_zoom, box_zoom,'
           u'box_select, reset'),
    active_scroll=u'wheel_zoom',
)

# Hovering over a point shows the value of its 'word' column.
tsne_plot.add_tools(HoverTool(tooltips=u'@word'))

# One translucent blue circle per word, outlined in black while hovered.
tsne_plot.circle(
    u'x_coord', u'y_coord',
    source=plot_data,
    size=10,
    color=u'blue',
    fill_alpha=0.1,
    line_alpha=0.2,
    hover_line_color=u'black',
)

# Cosmetics: larger title, and no axes, grid lines or outline.
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Render the figure.
show(tsne_plot);
In [ ]:
In [ ]:
In [ ]: